# -*- coding: utf-8 -*-
"""
------------------------------------------------------------------------------
    File Name:  SpliderMan
    Author   :  wanwei1029
    Date     :  2018/5/17
    Desc     :
------------------------------------------------------------------------------
"""
from UrlManager import *
from BaikeHtmlParser import *
from DataOutput import *
from HtmlDownLoader import *


class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = BaikeHtmlParser()
        self.output = DataOutput()

    def crwal(self, root_url):
        self.manager.add_new_url(root_url)
        while( self.manager.has_new_url() and self.manager.old_url_size() < 100 ):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已经抓取了%s个链接"%self.manager.old_url_size())
            except Exception :
                print("failed")
        self.output.output_html()

if __name__ == "__main__":
    spider_man = SpiderMan()
    root_url = "https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"
    spider_man.crwal(root_url)
